package org.curiosity.crawl;
import com.google.common.collect.ImmutableMap;
import org.curiosity.concept.Camera;
import org.curiosity.concept.Image;
import org.jsoup.Connection;
import org.jsoup.HttpStatusException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;
import java.util.List;
import java.util.Map;
/**
 * A {@link WebCrawler} is responsible for crawling Curiosity data on NASA's website.
 *
 * <p>This mainly exists to be a container for {@link ImageCrawler} configuration
 * (which should eventually move to a configuration file) and to provide a shared
 * {@link #httpGet(Connection)} helper that converts I/O failures into unchecked exceptions.
 *
 * @author jherwitz
 */
public abstract class WebCrawler extends Crawler {

    /** Base location of the raw-image pages. Note that the value carries no URL scheme. */
    protected final String root = "mars.jpl.nasa.gov/msl/multimedia/raw/";

    /**
     * String suffix associated with each {@link Camera}.
     * NOTE(review): presumably appended to a request path/query by subclasses; confirm against callers.
     */
    protected final Map<Camera, String> cameraSuffixes = ImmutableMap.of(
            Camera.FrontHazcam, "FHAZ_",
            Camera.RearHazcam, "RHAZ_",
            Camera.LeftNavcam, "NAV_LEFT_",
            Camera.RightNavcam, "NAV_RIGHT_",
            Camera.Mastcam, "MAST_");

    /** Timeout for image list requests, in milliseconds. */
    protected final int imageListRequestTimeout = 10000; // milliseconds

    /** How long {@link #httpGet(Connection)} sleeps after an HTTP 503 response, in milliseconds. */
    protected final long backoffTime = 10000; //TODO: exponential backoff

    /** Browser-like user agent string available to subclasses when building requests. */
    protected final String spoofUserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36";

    /**
     * Executes a GET on the given connection and returns the parsed document.
     *
     * <p>If the server answers with HTTP 503, the calling thread sleeps for
     * {@link #backoffTime} milliseconds before the failure is rethrown. This method
     * does not retry the request itself; the sleep only delays whatever the caller does next.
     *
     * @param conn the prepared connection to execute
     * @return the document returned by {@code conn.get()}
     * @throws RuntimeException wrapping the {@link IOException} if the request fails, or wrapping
     *         the {@link InterruptedException} if the thread is interrupted during the back-off
     *         sleep (the interrupt flag is restored and the HTTP failure is attached as suppressed)
     */
    protected Document httpGet(Connection conn) {
        try {
            // TODO: execute network request in a separate thread pool
            return conn.get();
        } catch (HttpStatusException e) {
            // sleep for backoffTime if we're asked to slow down
            if (e.getStatusCode() == 503) {
                sleepForBackoff(e);
            }
            throw new RuntimeException(e);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Sleeps for {@link #backoffTime} milliseconds.
     *
     * @param failure the HTTP failure that triggered the back-off; kept as a suppressed
     *        exception if the sleep is interrupted so it is not lost from the stack trace
     */
    private void sleepForBackoff(IOException failure) {
        try {
            Thread.sleep(backoffTime);
        } catch (InterruptedException interrupted) {
            // Restore the interrupt flag so code further up the stack can observe the cancellation.
            Thread.currentThread().interrupt();
            interrupted.addSuppressed(failure);
            throw new RuntimeException(interrupted);
        }
    }
}